### Load required libraries
library(readxl)
library(tidyverse)
library(dplyr)
library(knitr)
library(DT)
library(httr)
library(jsonlite)

### Load functions
source(file = "../R/02_functions.R")

### Define paths
current_dir <- getwd()

# MAF-like file
maf_data <- "../data/_raw/41467_2017_1460_MOESM6_ESM_somatic_mutations.xlsx"
maf_path <- file.path(current_dir, 
                      maf_data)

### Read data
maf_df <- read_excel(maf_path,
                        skip=1,
                        col_names=TRUE)

0.1 See mutations per patient

unique_tumor_counts <- maf_df %>%
  dplyr::count(tumor_name, 
        sort = TRUE)  # Count unique values and sort

# Display the result
datatable(unique_tumor_counts, 
          extensions = 'Buttons', 
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'csv'),
            scrollX=TRUE,
            pageLength=10,
            columnDefs = list(list(
              targets = "_all",
              render = JS(
                "function(data, type, row, meta) {",
                "  return data === null ? 'NA' : data;",
                "}"
              )
            ))
          ),
          caption = "Mutation counts per patient"
        )

0.2 Pre-processing

# Filter rows where Entrez_Gene_Id is 0
missing_entrez_df <- maf_df %>%
  filter(Entrez_Gene_Id == 0)

# Filter rows where Hugo_Symbol is Unknown
missing_hugo_df <- maf_df %>%
  filter(Hugo_Symbol == 'Unknown')

n_rows_1 <- nrow(missing_entrez_df)
n_rows_2 <- nrow(missing_hugo_df)

print(paste("Number of genomic regions missing an Entrez ID:", n_rows_1))
## [1] "Number of genomic regions missing an Entrez ID: 172"
print(paste("Number of genomic regions missing Hugo Symbol:", n_rows_2))
## [1] "Number of genomic regions missing Hugo Symbol: 957"

0.3 Get the Hugo Symbols & Ensembl Gene IDs from Entrez IDs

# Connect to the Ensembl BioMart database
ensembl = biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL", 
                  host="https://grch37.ensembl.org", 
                  path="/biomart/martservice",
                  dataset="hsapiens_gene_ensembl")

# Extracting non-NA and non-zero values and ensuring they are unique
entrez_ids <- unique(na.omit(maf_df$Entrez_Gene_Id[maf_df$Entrez_Gene_Id != 0]))

# Ensure that entrez_ids is a character vector
entrez_ids <- as.character(entrez_ids)

# Retrieve Hugo symbols & Ensembl gene ids
reannotated_df <- biomaRt::getBM(attributes = c("entrezgene_id", 
                                                "hgnc_symbol", 
                                                "ensembl_gene_id"),
                   filters = "entrezgene_id",
                   values = entrez_ids,
                   mart = ensembl)

# Rename columns so that they match maf_df
# reannotated_df <- reannotated_df %>%
#   dplyr::rename(
#     Entrez_Gene_Id = entrezgene_id,
#     Ensembl_Gene_Id = ensembl_gene_id
#   )

# Left join reannotated_df with maf_df
maf_df_updated <- maf_df %>%
  left_join(reannotated_df, by = c("Entrez_Gene_Id" = "entrezgene_id"))
## Warning in left_join(., reannotated_df, by = c(Entrez_Gene_Id = "entrezgene_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6 of `x` matches multiple rows in `y`.
## ℹ Row 3125 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# View the updated dataframe
DT::datatable(maf_df_updated, 
          extensions = c('Buttons'),
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'csv'),
            scrollX=TRUE,
            pageLength=10,
            columnDefs = list(list(
              targets = "_all",
              render = JS(
                "function(data, type, row, meta) {",
                "  return data === null ? 'NA' : data;",
                "}"
              )
            ))
          ),
          caption = "MAF_df reannotated"
        )
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Count the number of unique values in the ensembl_gene_id column
num_unique_values <- reannotated_df %>% 
  summarise(unique_count = n_distinct(entrezgene_id)) %>% 
  pull(unique_count)

# Print the number of unique values
print(paste("Number of unique Entrez Gene IDs in reannotated_df:", num_unique_values))
## [1] "Number of unique Entrez Gene IDs in reannotated_df: 6629"
# Search for a particular Entrez_Gene_Id
reannotated_df %>% 
  filter(entrezgene_id == "100526835")
##   entrezgene_id hgnc_symbol ensembl_gene_id
## 1     100526835 FPGT-TNNI3K ENSG00000259030
## 2     100526835      TNNI3K ENSG00000116783

0.4 Export info to use Ensembl VEP (web version)

### Extract info in correct format for VEP input
api_input <- maf_df %>% 
  select(Chromosome,
         Start_position,
         End_position,
         ref_allele,
         alt_allele,
         Strand) %>% 
  mutate(Allele_ref_alt = paste(ref_allele, 
                                alt_allele, 
                                sep = "/")) %>% 
  mutate(API_info = paste(Chromosome,
                          Start_position,
                          End_position,
                          Allele_ref_alt,
                          Strand,
                          sep = " "))

### Export the info to a file and use the Web VEP
vep_input_data <- "../data/vep_input_grch37.txt"
vep_file_path <- file.path(current_dir, 
                      vep_input_data)
write.table(api_input$API_info, file = vep_file_path, row.names = FALSE, col.names = FALSE, quote = FALSE)

0.5 Run VEP and explore output

### Define path
vep_out_data <- "../data/vep_output_grch37_01.txt"
vep_path <- file.path(current_dir,
                      vep_out_data)

### Read data
vep_df <- read.csv(vep_path, header = TRUE, sep = "\t")

### Display result
datatable(vep_df, 
          extensions = c('Buttons'),
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'csv'),
            scrollX=TRUE,
            pageLength=10,
            columnDefs = list(list(
              targets = "_all",
              render = JS(
                "function(data, type, row, meta) {",
                "  return data === null ? 'NA' : data;",
                "}"
              )
            ))
          ),
          caption = "VEP output"
        )

0.6 Session Info

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] jsonlite_1.8.8  httr_1.4.7      DT_0.31         knitr_1.45     
##  [5] lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
##  [9] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
## [13] ggplot2_3.4.4   tidyverse_2.0.0 readxl_1.4.3   
## 
## loaded via a namespace (and not attached):
##  [1] KEGGREST_1.42.0         gtable_0.3.4            xfun_0.41              
##  [4] bslib_0.6.1             htmlwidgets_1.6.4       Biobase_2.62.0         
##  [7] tzdb_0.4.0              bitops_1.0-7            vctrs_0.6.5            
## [10] tools_4.3.2             crosstalk_1.2.1         generics_0.1.3         
## [13] curl_5.2.0              stats4_4.3.2            fansi_1.0.6            
## [16] AnnotationDbi_1.64.1    RSQLite_2.3.5           blob_1.2.4             
## [19] pkgconfig_2.0.3         dbplyr_2.4.0            S4Vectors_0.40.2       
## [22] GenomeInfoDbData_1.2.11 lifecycle_1.0.4         compiler_4.3.2         
## [25] progress_1.2.3          Biostrings_2.70.1       munsell_0.5.0          
## [28] GenomeInfoDb_1.38.5     htmltools_0.5.7         sass_0.4.8             
## [31] RCurl_1.98-1.14         yaml_2.3.8              crayon_1.5.2           
## [34] pillar_1.9.0            jquerylib_0.1.4         ellipsis_0.3.2         
## [37] cachem_1.0.8            tidyselect_1.2.0        digest_0.6.34          
## [40] stringi_1.8.3           biomaRt_2.58.1          fastmap_1.1.1          
## [43] grid_4.3.2              colorspace_2.1-0        cli_3.6.2              
## [46] magrittr_2.0.3          XML_3.99-0.16.1         utf8_1.2.4             
## [49] withr_3.0.0             rappdirs_0.3.3          filelock_1.0.3         
## [52] prettyunits_1.2.0       scales_1.3.0            bit64_4.0.5            
## [55] timechange_0.3.0        XVector_0.42.0          rmarkdown_2.25         
## [58] bit_4.0.5               cellranger_1.1.0        png_0.1-8              
## [61] hms_1.1.3               memoise_2.0.1           evaluate_0.23          
## [64] IRanges_2.36.0          BiocFileCache_2.10.1    rlang_1.1.3            
## [67] glue_1.7.0              DBI_1.2.1               xml2_1.3.6             
## [70] BiocGenerics_0.48.1     rstudioapi_0.15.0       R6_2.5.1               
## [73] zlibbioc_1.48.0